IMPORT REQUIRED LIBRARIES

In [1]:
# Import required packages

import os
import warnings
import numpy as np
import datetime as dt
import openpyxl
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from scipy import stats
import plotly.graph_objects as go      # pip install plotly, conda install -c plotly plotly=4.8.1
import plotly.io as pio
import plotly.express as px
import plotly.offline as py
from plotly.offline import init_notebook_mode, plot_mpl
from fbprophet.plot import plot_plotly # pip install fbprophet
from fbprophet import Prophet
import pandas as pd
# NOTE(review): several imports above (dt, openpyxl, seasonal_decompose, sm,
# stats, pio, px, init_notebook_mode, plot_mpl) do not appear to be used
# later in this notebook -- candidates for removal once confirmed.
py.init_notebook_mode()                 # enable offline plotly rendering in the notebook
pd.options.plotting.backend = 'plotly'  # change default pandas matplotlib to plotly
warnings.filterwarnings('ignore')       # hides ALL warnings, incl. deprecations -- use with care

# pretty cell outputs: display every expression result in a cell, not only the last one

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

IMPORT DATA

In [2]:
# Import csv file
# Raw load data: one row per (date, zone_id) with 24 hourly columns h_0..h_23.
rawdf = pd.read_csv('./data/01_rawdata.csv')
In [3]:
# Quick sanity check of the raw frame: dimensions and a preview of the first rows.
rawdf.shape
rawdf.head()
# rawdf.info()
Out[3]:
(26160, 26)
Out[3]:
date zone_id h_0 h_1 h_2 h_3 h_4 h_5 h_6 h_7 ... h_14 h_15 h_16 h_17 h_18 h_19 h_20 h_21 h_22 h_23
0 1/1/04 1 66025.0 64655.0 63898.0 64078.0 65734.0 63360.0 66977.0 68704.0 ... 50102.0 49364.0 51966.0 60480.0 64995.0 65275.0 65167.0 65055.0 65449.0 61156.0
1 1/1/04 2 16908.0 16505.0 16572.0 16928.0 17119.0 17782.0 18629.0 19410.0 ... 13573.0 13193.0 14185.0 16864.0 18205.0 18290.0 17980.0 16959.0 16217.0 14805.0
2 1/1/04 3 126314.0 123368.0 119247.0 117562.0 118398.0 121283.0 126786.0 132344.0 ... 127738.0 126884.0 135218.0 155491.0 157905.0 156895.0 153409.0 147031.0 137768.0 128078.0
3 1/1/04 4 136288.0 133110.0 128663.0 126846.0 127747.0 130860.0 136798.0 142796.0 ... 137825.0 136904.0 145897.0 167770.0 170376.0 169285.0 165525.0 158642.0 148647.0 138193.0
4 1/1/04 5 539.0 512.0 505.0 503.0 499.0 545.0 578.0 636.0 ... 581.0 562.0 615.0 674.0 748.0 718.0 692.0 660.0 605.0 577.0

5 rows × 26 columns

In [4]:
# check for null values (kept commented: per the .info() output below the
# melted data has no missing values, so nothing needs dropping here)
# rawdf.isna().sum()
# rawdf = rawdf.dropna() # to get rid of null values
# rawdf.shape
# # get rid of fully-empty rows/columns, if any
# rawdf.dropna(axis=1,how='all',inplace=True)
# rawdf.dropna(axis=0,how='all',inplace=True)
# rawdf.shape

DATA MANIPULATION

Convert date column to datetime format

In [5]:
# Convert the raw date strings (e.g. '1/1/04') to pandas Timestamps.
# An explicit format pins the US-style month/day/year interpretation --
# a bare pd.to_datetime() would silently guess the order -- and is also
# much faster than per-element inference.
rawdf['date'] = pd.to_datetime(rawdf['date'], format='%m/%d/%y')

# convert date as string and split date and time and keep only date
# rawdf.date = rawdf.date.apply(str).str.split(' ').str[0]

Stack hour column

In [6]:
# Stack the 24 hour columns into one 'load_value' column, giving a single
# (date, zone_id, hour) observation per row (long / univariate format).
hour_cols = ['h_' + str(h) for h in range(24)]   # h_0 ... h_23
rawdf = pd.melt(
    rawdf,
    id_vars=['date', 'zone_id'],
    value_vars=hour_cols,
    var_name='hour',
    value_name='load_value')

# Drop the fixed 'h_' prefix, leaving the bare hour number as a string.
# NOTE: the previous .str.strip('h_') removed any leading/trailing 'h'/'_'
# *characters* (str.strip takes a character set, not a prefix) and only
# worked by coincidence; slicing off the 2-character prefix is explicit.
rawdf.hour = rawdf.hour.str[2:]
rawdf.head()
Out[6]:
date zone_id hour load_value
0 2004-01-01 1 0 66025.0
1 2004-01-01 2 0 16908.0
2 2004-01-01 3 0 126314.0
3 2004-01-01 4 0 136288.0
4 2004-01-01 5 0 539.0

Join date and hour and convert to datetime format and unstack zone

In [7]:
# convert date as string and split date and time and keep only date
# rawdf['ds'] = rawdf.ds.apply(str).str.split(' ').str[0]

# Combine the date with the hour offset to build a full hourly timestamp.
# pd.to_timedelta on an integer Series is portable across pandas versions;
# casting a *string* Series with .astype('timedelta64[h]') was deprecated
# and is removed in pandas 2.x.
rawdf['date'] = pd.to_datetime(
    rawdf.date, errors='coerce') + pd.to_timedelta(rawdf.hour.astype(int), unit='h')
rawdf.date.head()
Out[7]:
0   2004-01-01
1   2004-01-01
2   2004-01-01
3   2004-01-01
4   2004-01-01
Name: date, dtype: datetime64[ns]
In [8]:
# The hour is now folded into the timestamp, so the separate column is
# redundant; storing zone_id as a categorical shrinks the frame.
rawdf = (
    rawdf
    .drop(columns=['hour'])
    .assign(zone_id=lambda d: d.zone_id.astype('category'))
)
rawdf.head()
Out[8]:
date zone_id load_value
0 2004-01-01 1 66025.0
1 2004-01-01 2 16908.0
2 2004-01-01 3 126314.0
3 2004-01-01 4 136288.0
4 2004-01-01 5 539.0
In [9]:
# Confirm dtypes, row count and memory footprint after the category conversion.
rawdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627840 entries, 0 to 627839
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   date        627840 non-null  datetime64[ns]
 1   zone_id     627840 non-null  category      
 2   load_value  627840 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1)
memory usage: 10.2 MB
In [10]:
# keep zones as columns: pivot back to wide format, one column per zone,
# with the hourly timestamp as the index (dropna=False keeps all timestamps)
rawdf = rawdf.pivot_table(
    values='load_value',
    index='date',
    columns='zone_id',
    dropna=False
)
# list comprehension method to add prefix zone_ in columns
rawdf.columns = ['zone_' + str(col) for col in rawdf.columns]
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to glue the head/tail preview together.
pd.concat([rawdf.head(2), rawdf.tail(2)])
rawdf.shape
Out[10]:
zone_1 zone_2 zone_3 zone_4 zone_5 zone_6 zone_7 zone_8 zone_9 zone_10 zone_11 zone_12 zone_13 zone_14 zone_15 zone_16
date
2004-01-01 00:00:00 66025.0 16908.0 126314.0 136288.0 539.0 6884.0 133143.0 136288.0 3179.0 75298.0 23394.0 90755.0 118433.0 20728.0 21846.0 976022.0
2004-01-01 01:00:00 64655.0 16505.0 123368.0 133110.0 512.0 6651.0 129964.0 133110.0 3011.0 67423.0 22155.0 86754.0 112535.0 19721.0 21455.0 940929.0
2008-06-22 22:00:00 58904.0 19096.0 158621.0 171148.0 471.0 6741.0 165307.0 171148.0 3243.0 48544.0 76405.0 123623.0 166911.0 17927.0 24749.0 1212838.0
2008-06-22 23:00:00 51230.0 15415.0 141483.0 152657.0 408.0 5555.0 146983.0 152657.0 2782.0 58750.0 68912.0 106253.0 143352.0 15485.0 20684.0 1082606.0
Out[10]:
(39240, 16)

VISUALIZATION

Let’s plot hourly data for each column of the dataset

In [11]:
# Set notebook mode to work in offline
py.init_notebook_mode()

# Quick look at the first zone only, using the plotly pandas backend set above.
fig=rawdf.iloc[:,0:1].plot(template="ggplot2")    # change pandas backend to plotly to replace matplotlib
fig = fig.update_layout(
    yaxis_title="<b>Load value (in kW)</b>",
    title="<b>Energy load demand </b>",
    title_x=0.5, 
    xaxis_title= "<b>Date</b>")
fig

RESAMPLING

Convert to daily data if modeling is done on daily rather than hourly

In [12]:
# Aggregate the hourly series to daily totals (resample needs the
# DatetimeIndex created by the pivot above).
rawdf_day = rawdf.resample('D').sum()
# .append was removed in pandas 2.0 -> use pd.concat for the preview
pd.concat([rawdf_day.head(2), rawdf_day.tail(2)])
rawdf_day.shape
Out[12]:
zone_1 zone_2 zone_3 zone_4 zone_5 zone_6 zone_7 zone_8 zone_9 zone_10 zone_11 zone_12 zone_13 zone_14 zone_15 zone_16
date
2004-01-01 1486969.0 402200.0 3258627.0 3515963.0 14661.0 189403.0 3446711.0 3515963.0 91890.0 1659438.0 553697.0 2302100.0 3006189.0 524242.0 518939.0 24486992.0
2004-01-02 1426468.0 409477.0 3698050.0 3990102.0 14196.0 172139.0 3868870.0 3990102.0 86195.0 1706604.0 613061.0 2265568.0 2952445.0 488337.0 467138.0 26148752.0
2008-06-21 1387411.0 460203.0 3866585.0 4171952.0 11658.0 164205.0 4029471.0 4171952.0 81848.0 1538079.0 1910496.0 3007018.0 3884032.0 412492.0 519430.0 29616832.0
2008-06-22 1433686.0 443667.0 3811310.0 4112307.0 11593.0 175289.0 3985279.0 4112307.0 83628.0 1594905.0 1833609.0 2972209.0 3952427.0 427831.0 581157.0 29531204.0
Out[12]:
(1635, 16)
In [13]:
# Verify the daily frame: one row per day, 16 zone columns, no missing values.
rawdf_day.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1635 entries, 2004-01-01 to 2008-06-22
Freq: D
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   zone_1   1635 non-null   float64
 1   zone_2   1635 non-null   float64
 2   zone_3   1635 non-null   float64
 3   zone_4   1635 non-null   float64
 4   zone_5   1635 non-null   float64
 5   zone_6   1635 non-null   float64
 6   zone_7   1635 non-null   float64
 7   zone_8   1635 non-null   float64
 8   zone_9   1635 non-null   float64
 9   zone_10  1635 non-null   float64
 10  zone_11  1635 non-null   float64
 11  zone_12  1635 non-null   float64
 12  zone_13  1635 non-null   float64
 13  zone_14  1635 non-null   float64
 14  zone_15  1635 non-null   float64
 15  zone_16  1635 non-null   float64
dtypes: float64(16)
memory usage: 217.1 KB

The plot shows seasonality.

Take subset of dataframe choosing daily data for 5 zones

In [14]:
# Lets use only 5 zones
# data = rawdf.iloc[:, 0:5]           # hourly data
data = rawdf_day.iloc[:, 0:5]       # daily data
# .append was removed in pandas 2.0 -> use pd.concat for the preview
pd.concat([data.head(2), data.tail(2)])
data.shape
Out[14]:
zone_1 zone_2 zone_3 zone_4 zone_5
date
2004-01-01 1486969.0 402200.0 3258627.0 3515963.0 14661.0
2004-01-02 1426468.0 409477.0 3698050.0 3990102.0 14196.0
2008-06-21 1387411.0 460203.0 3866585.0 4171952.0 11658.0
2008-06-22 1433686.0 443667.0 3811310.0 4112307.0 11593.0
Out[14]:
(1635, 5)

Time series plot for zone 1 (daily data)

In [15]:
# Set notebook mode to work in offline
py.init_notebook_mode()

# create output directory for the exported html figures

if not os.path.exists("images"):
    os.mkdir("images")
# define a function to plot graph

def plot_tsplot(col, df=None):
    """Render an interactive time-series chart for one zone column.

    Parameters
    ----------
    col : str
        Name of the column (zone) to plot.
    df : pandas.DataFrame, optional
        Date-indexed frame to plot from. Defaults to the notebook-level
        ``data`` frame, so existing ``plot_tsplot(col)`` calls keep working;
        passing it explicitly removes the hidden dependency on global state.

    Side effects: writes images/<col>_EDA_tstplot.html and shows the figure.
    """
    if df is None:
        df = data  # fall back to the global 5-zone daily frame
    fig = go.Figure()
    fig = fig.add_trace(go.Scatter(
        x=df.index, y=df[col], mode='lines'))  # mode='lines+markers'
    fig = fig.update_layout(
        width = 1000, height = 600,
        title=dict(text="<b>Timeseries plot for daily demand for energy for "+col+"</b>",
                   y=0.9, x=0.5, font_size=18),
        # hovermode='x',
        xaxis=dict(
            title="<b>Date</b>", linecolor='black', linewidth=1,
            rangeslider_visible=True,
            # quick-zoom buttons: week / month / 6 months / year / everything
            rangeselector=dict(
                buttons=list(
                    [dict(count=7, label="1w", step="day", stepmode="backward"),
                     dict(count=1, label="1m", step="month", stepmode="backward"),
                     dict(count=6, label="6m", step="month", stepmode="backward"),
                     dict(count=1, label="1y", step="year", stepmode="backward"),
                     dict(label='full', step="all")
                     ])
            ),
            ticks="outside"
        ),
        yaxis=dict(title="<b>Load demand (in kW)</b>",
                   linecolor='black', linewidth=1,  ticks="outside"),
        font=dict(family="Helvetica", size=12, color="black"),
        paper_bgcolor="white",
        template="plotly",
    )
    # (was two adjacent string literals "_EDA_tstplot"".html" -- same result,
    # written as one literal for clarity)
    fig.write_html("images/" + col + "_EDA_tstplot.html")   # write_image for other formats
    fig.show()

# display for only 1 plot, change as required
for col in data.columns[0:1]:
    plot_tsplot(col)
In [16]:
# plot method 2 (alternative, kept for reference): pandas plotly backend
# with the ggplot2 theme instead of building go.Figure by hand

# for colnum in range(0, 5):
#     data.iloc[:, colnum].plot(template="ggplot2").show()

# fig.update_layout(
#     yaxis_title="<b>Load value (in kW)</b>",
#     title="<b>Energy load demand </b>",
#     title_x=0.5, 
#     xaxis_title= "<b>Date</b>")

EXPORT CSV FILE

In [17]:
# Export as csv file with 5 zones (index kept: it is the date column)

data.to_csv("./data/02_clean_data.csv", index=True)
In [18]:
# To run the whole pipeline for every zone, wrap the per-zone steps in a
# function and loop over the columns, for example:

# def eachzone(col):
#     # ALL YOUR ACTIONS
#     print(data.columns)

# for col in data:
#     eachzone(col)

MODELING USING FBPROPHET

Creating the data set for Prophet

The ds column should be YYYY-MM-DD for a date, or YYYY-MM-DD HH:MM:SS for a timestamp.

In [19]:
# Import holidays csv file (expects a 'ds' date column, Prophet's convention)
holidays = pd.read_csv('./data/holidays.csv')
holidays.ds=pd.to_datetime(holidays.ds)
# NOTE(review): `holidays` is loaded but the holidays= argument is commented
# out in the models below -- confirm whether it should actually be passed in.

Filter by each zone

In [20]:
# data.columns
# ['zone_1', 'zone_2', 'zone_3', 'zone_4', 'zone_5']
col = 'zone_1'
df = data[[col]].reset_index()  # select each zone and reset index
# Prophet requires date = ds, feature = y
df.columns = ['ds', 'y']
# .append was removed in pandas 2.0 -> use pd.concat for the preview
pd.concat([df.head(2), df.tail(2)])
Out[20]:
ds y
0 2004-01-01 1486969.0
1 2004-01-02 1426468.0
1633 2008-06-21 1387411.0
1634 2008-06-22 1433686.0

Split data into training and test set (80-20)

First 80% of data are taken as training and remaining 20% as test dataset

In [21]:
# Chronological 80/20 split -- no shuffling, since this is a time series.
training_size = int(len(df) * 0.8)
print(training_size)

train_df = df[:training_size]
test_df = df[training_size:]
# sanity check: share of rows that ended up in the training set (in %)
print(len(train_df) * 100 / (len(train_df) + len(test_df)))
1308
80.0

Fit Prophet Model

Prophet will by default fit weekly and yearly seasonalities if the time series is more than two cycles long. It will also fit daily seasonality for a sub-daily time series. You can add other seasonalities (monthly, quarterly, hourly) if required.

Fit basic model

In [22]:
# # # fit a prophet model on training dataset
# # # (baseline with all defaults, kept commented for comparison with the
# # # tweaked model below -- see the error table further down)

# model = Prophet()        
# model.fit(train_df)

Check forecasting errors from basic model and retweak parameters in the model

In [23]:
# Tuning notes (check EDA for monthly/quarterly trends before adding
# seasonalities; do not loop over parameters blindly):
# - holidays: do not use for hourly data; holidays_prior_scale 20-40 if
#   holidays have a big impact (default = 10)
# - growth: 'linear' by default; use 'logistic' for saturating curves
# - fourier_order: try 10-25 (higher normally gives better fit)
# - period = 30.5 --> what happened at a certain point is likely to happen
#   again ~30.5 days later (monthly seasonality)
# - seasonality_prior_scale: 10-25
# - n_changepoints: sudden/abrupt changes in trend; let Prophet find them
#   itself first, then adjust
In [24]:
# Tweaked model: Prophet's built-in seasonalities are disabled and re-added
# explicitly so their Fourier orders can be controlled (weekly=3, yearly=10).
# Commented lines are tuning options kept for reference.
# NOTE(review): fbprophet was renamed to `prophet` from v1.0 -- confirm the
# installed package name before re-running.
model = Prophet(
    growth='linear',
#     holidays=holidays,           
    seasonality_mode='additive',
#     n_changepoints = 100,          # default = 25 
    changepoint_prior_scale=0.05, # default = 0.05
#     seasonality_prior_scale=15,
#     holidays_prior_scale=10,      # default = 10
    daily_seasonality=False,
    weekly_seasonality=False,
    yearly_seasonality=False
#     ).add_seasonality(name="monthly", period=30.5, fourier_order=6
    # ).add_seasonality(name="daily", period=1,fourier_order=15
    ).add_seasonality(name="weekly", period=7,fourier_order=3
#     ).add_seasonality(name="6month", period=30.5*6,fourier_order=4
    ).add_seasonality(name="yearly", period=365.25,fourier_order=10
    # ).add_seasonality(name="quarterly", period=365.25/4, fourier_order=5, prior_scale=15
    )
model.fit(train_df)
Out[24]:
<fbprophet.forecaster.Prophet at 0x1a2b85a590>

Predicting the values for the future

Create a dataframe with a ds (datestamp) column containing the dates for which we want to make the predictions.

In [25]:
# Add dataframe including test size, By default it includes dates from the history
# specify frequency as H if hourly and D if daily data
future = model.make_future_dataframe(periods=len(test_df), freq='D') # periods=300, freq='H', 'month'

# Predict for train/test dataset (history dates plus the test horizon)
forecast = model.predict(future)
# forecast1.tail().T # transpose dataframe
# forecast[(forecast.ds > "2008-6-22")].head().T
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()   # point forecast + uncertainty interval
Out[25]:
ds yhat yhat_lower yhat_upper
1630 2008-06-18 1.519599e+06 1.275701e+06 1.740611e+06
1631 2008-06-19 1.529604e+06 1.292233e+06 1.758021e+06
1632 2008-06-20 1.536523e+06 1.317767e+06 1.760446e+06
1633 2008-06-21 1.570715e+06 1.335231e+06 1.808145e+06
1634 2008-06-22 1.555614e+06 1.329260e+06 1.791106e+06

Create a full dataframe matching the index

In [26]:
# Define a function that lines up predictions with the actual observations.
def actual_pred_dataframe(historical, forecast):
    """Return a ds-indexed frame with yhat/yhat_lower/yhat_upper from
    `forecast`, plus the actual 'y' column joined in from `historical`
    (left join on the ds timestamp, so future rows get NaN actuals)."""
    predictions = forecast.set_index('ds')
    predictions = predictions[['yhat', 'yhat_lower', 'yhat_upper']]
    actuals = historical.set_index('ds')
    return predictions.join(actuals)

cmp_df = actual_pred_dataframe(df, forecast)
# DataFrame.append was removed in pandas 2.0 -> preview via pd.concat
pd.concat([cmp_df.head(2), cmp_df.tail(2)])
cmp_df.to_csv("./data/03_cmp_df.csv", index=True)        # Export as csv file
Out[26]:
yhat yhat_lower yhat_upper y
ds
2004-01-01 1.526063e+06 1.309019e+06 1.765743e+06 1486969.0
2004-01-02 1.516805e+06 1.296289e+06 1.758293e+06 1426468.0
2008-06-21 1.570715e+06 1.335231e+06 1.808145e+06 1387411.0
2008-06-22 1.555614e+06 1.329260e+06 1.791106e+06 1433686.0

Evaluate performance of model using MAPE and MAE

MAPE: mean absolute percentage error
MAE: mean absolute error

In [27]:
# Helper that scores the forecast: MAPE (%) and MAE on the hold-out rows.

def calculate_forecast_errors(dataframe, training_size):
    """Compute MAPE and MAE over the rows after position `training_size`.

    `dataframe` must contain actual values in 'y' and predictions in 'yhat';
    the first `training_size` rows (the training period) are excluded so the
    errors reflect out-of-sample performance only.
    """
    scored = dataframe.copy()   # work on a copy so the caller's frame is untouched

    scored['e'] = scored['y'] - scored['yhat']       # raw error
    scored['p'] = 100 * scored['e'] / scored['y']    # percentage error

    holdout = scored[training_size:]                 # test data part

    def error_mean(error_name):
        # mean of absolute errors, rounded to 2 decimals
        return np.mean(abs(holdout[error_name])).round(2)

    return {'MAPE': error_mean('p'), 'MAE': error_mean('e')}

# Report MAPE and MAE for the hold-out period

error_summary = calculate_forecast_errors(cmp_df, training_size)
for metric, value in error_summary.items():
    print(metric, value)
MAPE 11.38
MAE 186657.72

Observation of errors

  • Basic model: MAPE 11.4; MAE 187042.95

  • Tweaked model: MAPE 11.38; MAE 186657.72

Lets choose model with tweaked parameters

CREATE VISUALIZATIONS

Component plot (training data)

In [28]:
# display forecast components Prophet.plot_components method
# trend, yearly seasonality, and weekly seasonality of the time series
# include holiday to see holidays
# (this is a matplotlib figure, hence savefig rather than write_html)

component_plot = model.plot_components(forecast)
component_plot.savefig("images/"+col+"_component_plot_train"".pdf")  # pandas---savefig, write_html

# py.init_notebook_mode(connected=True)   # connection=false,default,requires internet, file size high
# convert the plot into interactive one
# component_plot = plot_mpl(component_plot, filename="images/component_plot_train.html", auto_open=True)        #opens graph in html in brower

Observations from component plot for zone 1

  • An increasing trend was observed up to 2005, after which the trend declines. This coincides with the Great Recession, a global economic downturn that devastated world financial markets as well as the banking and real estate industries.
  • Weekly trend shows higher demand on monday, tuesday and saturday
  • Yearly trend shows higher demand during January-February (winter season) and July-August (summer season).

Actual vs predicted plot (train dataset with prediction for test dataset-date)

In [29]:
py.init_notebook_mode()              # make it true if you want reduce file, but you need to be online
# Interactive actual-vs-forecast chart straight from the fitted model.
forecast_plot = plot_plotly(model, forecast)  # This returns a plotly Figure
forecast_plot.write_html("images/forecast_plot_traintest.html", auto_open=False) # turn it to true if you want to open it on own.show
forecast_plot.show()

Alternative graph: Training and Test Dataset with prediction by model

In [30]:
# Set notebook mode to work in offline
py.init_notebook_mode()

# Actual values (markers) vs model predictions (line) over the whole period.
fig = go.Figure()
fig = fig.add_trace(go.Scatter(
    x=cmp_df.index, y=cmp_df.y, mode='markers', name='actual',line_color='rgb(0,100,80)'))
fig = fig.add_trace(go.Scatter(
    x=cmp_df.index, y=cmp_df.yhat, mode='lines', name='predicted', line_color='rgb(231,107,243)'))
fig = fig.update_layout(
    width=1000, height=600,
    title=dict(text="<b>Prophet model train test</b>", y=0.9, x=0.5, font_size=18),
    xaxis=dict(title="<b>Date</b>", linecolor='black', linewidth=1,
        # quick-zoom buttons: week / month / 6 months / year / everything
        rangeselector=dict(
            buttons=list(
                [dict(count=7, label="1w", step="day",
                        stepmode="backward"),
                    dict(count=1, label="1m", step="month",
                        stepmode="backward"),
                    dict(count=6, label="6m", step="month",
                        stepmode="backward"),
                    dict(count=1, label="1y", step="year",
                        stepmode="backward"),
                    dict(label='full', step="all")
                    ])
    )),
    yaxis=dict(title="<b>Load demand (in kW)</b>", linecolor='black', linewidth=1),
    font=dict(family="Helvetica", size=12, color="black"),
    paper_bgcolor="white"
    )

# NOTE(review): the rectangle bounds below are hard-coded for zone_1's value
# range and (presumably) mark the hold-out window -- confirm the dates match
# the actual 80/20 split boundary.
fig = fig.add_shape(
        # filled Rectangle
            type="rect",
            x0="2007-08-01", y0=0,
            x1="2008-06-22", y1=3000000,
            line=dict(
                color="black",
                width=1,
            ),
            fillcolor="red", opacity=0.09, layer="below", line_width=0
        )

fig.write_html("images/"+col+"_Train_Test_actualvspred"".html")       # write_image for other formats
fig.show()

Alternative way to get forecasting errors

In [31]:
# Alternative error computation (kept for reference): RMSE + MAPE via sklearn
# test_forecast = forecast[training_size:]

# from math import sqrt
# from sklearn.metrics import mean_squared_error
# def mean_absolute_percentage_error(y_true, y_pred):
#     return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# def errors(y_true,y_pred):
#     rmse = sqrt(mean_squared_error(y_true, y_pred))
#     mape = mean_absolute_percentage_error(y_true, y_pred)
#     return{'MAPE':mape,
#            'RMSE':rmse}

# errors(test_df.y, test_forecast.yhat)

RETAIN MODEL AND FIT TO FULL DATASET

In [32]:
# # fit prophet model with similar parameters on full dataset
# (same custom seasonalities as the tuned model above, now trained on
# train+test so the future forecast uses all available history)
model2 = Prophet(
    growth='linear',       
    seasonality_mode='additive',
    changepoint_prior_scale=0.05, # default = 0.05
    daily_seasonality=False,
    weekly_seasonality=False,
    yearly_seasonality=False
    ).add_seasonality(name="weekly", period=7,fourier_order=3
    ).add_seasonality(name="yearly", period=365.25,fourier_order=10
    )
model2.fit(df)
Out[32]:
<fbprophet.forecaster.Prophet at 0x1a2680cb10>
In [33]:
# Forecast 30 days past the end of the data.
future2 = model2.make_future_dataframe(periods=30, freq='D') # 24*7 days for hourly data

# Predict for train/test dataset
forecast2 = model2.predict(future2)

# forecast2[(forecast2.ds > "2008-6-23")]

Create a dataframe for forecast period only

In [34]:
# Keep only the 30 newly forecast days (the tail of the prediction frame)
# with the point forecast and its uncertainty bounds.
final_forecast = (
    forecast2
    .set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']]
    .tail(30)
)
final_forecast.to_csv("./data/04_final_forecast.csv", index=True)   # Export as csv file

CREATE FINAL VISUALIZATIONS:

Actual vs predicted for train,test along with future prediction

In [35]:
# Previews (.append was removed in pandas 2.0 -> pd.concat)
pd.concat([cmp_df.head(2), cmp_df.tail(2)])                   # cmp_df: full dataset with actual and prediction
pd.concat([final_forecast.head(2), final_forecast.tail(2)])   # final_forecast: future predicted dataset y-hat and confidence interval
Out[35]:
yhat yhat_lower yhat_upper y
ds
2004-01-01 1.526063e+06 1.309019e+06 1.765743e+06 1486969.0
2004-01-02 1.516805e+06 1.296289e+06 1.758293e+06 1426468.0
2008-06-21 1.570715e+06 1.335231e+06 1.808145e+06 1387411.0
2008-06-22 1.555614e+06 1.329260e+06 1.791106e+06 1433686.0
Out[35]:
yhat yhat_lower yhat_upper
ds
2008-06-23 1.655380e+06 1.416241e+06 1.900012e+06
2008-06-24 1.657490e+06 1.407526e+06 1.891466e+06
2008-07-21 1.831564e+06 1.596367e+06 2.087789e+06
2008-07-22 1.830074e+06 1.592948e+06 2.062949e+06

Create upper and lower limit values for forecast in plotting a graph

In [36]:
# Build the x/y series needed to draw the shaded confidence band: the
# filled polygon runs forward along the upper bound, then back along the
# reversed lower bound.
x3 = final_forecast.index
x3_rev = x3[::-1]  # reversed copy of the forecast dates

y3 = final_forecast.yhat           # NOTE(review): appears unused below -- the plot reads final_forecast.yhat directly
y3_upper = final_forecast.yhat_upper
y3_lower = final_forecast.yhat_lower[::-1]   # reversed, to close the band polygon

Forecast plot (final dataset)

In [37]:
# Set notebook mode to work in offline
py.init_notebook_mode()

# create figure: full history + in-sample fit + 30-day future forecast
# with its shaded confidence band

fig = go.Figure()

# Add line 1: actual observations
fig = fig.add_trace(go.Scatter(
    x=cmp_df.index, y=cmp_df.y, mode='lines', name='actual', line_color='rgb(0,100,80)'))
# Add line 2: in-sample predictions
fig = fig.add_trace(go.Scatter(
    x=cmp_df.index, y=cmp_df.yhat, mode='lines', name='predicted', line_color='rgb(231,107,243)'))


# Add lower and upper limit area for line 3 (closed polygon: dates forward
# along the upper bound, then reversed along the lower bound).
# NOTE: Series.append was removed in pandas 2.0 -> pd.concat for the y values;
# Index.append (used for x) is still supported.
fig = fig.add_trace(go.Scatter(
    x=x3.append(x3_rev),
    y=pd.concat([y3_upper, y3_lower]),
    fill='toself', fillcolor='rgba(0,176,246,0.2)', line_color='rgba(255,255,255,0)',
    name='future_prediction', showlegend=False))
# Add line 3: future point forecast
fig = fig.add_trace(go.Scatter(
    x=final_forecast.index, y=final_forecast.yhat, mode='lines', name='future_prediction', line_color='rgb(0,176,246)'))

# update layout
fig = fig.update_layout(
    width=1000, height=600,
    title=dict(text="<b>Prophet model train test</b>",
               y=0.9, x=0.5, font_size=18),
    xaxis=dict(title="<b>Date</b>", linecolor='black', linewidth=1, ticks="outside",
                   rangeslider_visible=True,
    rangeselector=dict(
        buttons=list(
            [dict(count=7, label="1w", step="day",
                    stepmode="backward"),
                dict(count=1, label="1m", step="month",
                    stepmode="backward"),
                dict(count=6, label="6m", step="month",
                    stepmode="backward"),
                dict(count=1, label="1y", step="year",
                    stepmode="backward"),
                dict(label='full', step="all")
                ])
    )),
    yaxis=dict(title="<b>Load demand (in kW)</b>",
               linecolor='black', linewidth=1, ticks="outside"),
    font=dict(family="Helvetica", size=12, color="black"),
    paper_bgcolor="white")

# NOTE(review): rectangle bounds are hard-coded for zone_1's value range --
# presumably marking the hold-out window; confirm against the split dates.
fig = fig.add_shape(
        # filled Rectangle
            type="rect",
            x0="2007-08-01", y0=0,
            x1="2008-06-22", y1=3000000,
            line=dict(
                color="black",
                width=1,
            ),
            fillcolor="red", opacity=0.09, layer="below", line_width=0
        )

fig.write_html("images/"+col+"_final_graph"".html")       # write_image for other formats
fig.show()

Alternate graph: Final forecast dataset

In [38]:
# Set notebook mode to work in offline
py.init_notebook_mode()

# Same final forecast rendered with fbprophet's built-in plotly helper.
forecast_plot = plot_plotly(model2, forecast2)  # This returns a plotly Figure
forecast_plot.write_html("images/"+col+"_final_graph_2"".html", auto_open=False)
forecast_plot.show()

Component plot for full dataset

In [39]:
# display forecast components Prophet.plot_components method
# trend, yearly seasonality, and weekly seasonality of the time series
# include holiday to see holidays
# (component plot for the full-data model; matplotlib figure saved as pdf)

# interactive plots
py.init_notebook_mode(connected=False)
component_plot = model2.plot_components(forecast2)
component_plot.savefig("images/"+col+"_component_plot_full"".pdf")  # pandas---savefig, write_html

WHAT CAN BE DONE?

  • Only historical energy-load data has been used. Including other factors such as temperature could reduce the forecasting error.
  • Retweak parameters in fbprophet model.
  • Cross Validation
  • Combining multiple forecast
  • Dynamic time-series modeling that explicitly accounts for the 2008 Great Recession.
In [ ]: